Read data

plasma = read.table("Plasma.txt", header = T)

Exploratory Data Analysis

sapply(plasma, class)
##         AGE         SEX    SMOKSTAT    QUETELET      VITUSE    CALORIES 
##   "integer" "character" "character"   "numeric" "character"   "numeric" 
##         FAT       FIBER     ALCOHOL CHOLESTEROL    BETADIET     RETDIET 
##   "numeric"   "numeric"   "numeric"   "numeric"   "integer"   "integer" 
##  BETAPLASMA   RETPLASMA 
##   "integer"   "integer"
sapply(plasma, function(x) sum(is.na(x)))
##         AGE         SEX    SMOKSTAT    QUETELET      VITUSE    CALORIES 
##           0           0           0           0           0           0 
##         FAT       FIBER     ALCOHOL CHOLESTEROL    BETADIET     RETDIET 
##           0           0           0           0           0           0 
##  BETAPLASMA   RETPLASMA 
##           0           0

Quantitative

get data with only quantitative variables

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
quant_plasma <- plasma %>%
  select_if(function(col) is.integer(col) |
                          is.numeric(col))

summary statistics

summary(quant_plasma)
##       AGE           QUETELET        CALORIES           FAT        
##  Min.   :19.00   Min.   :16.33   Min.   : 445.2   Min.   : 14.40  
##  1st Qu.:39.00   1st Qu.:21.80   1st Qu.:1338.0   1st Qu.: 53.95  
##  Median :48.00   Median :24.74   Median :1666.8   Median : 72.90  
##  Mean   :50.15   Mean   :26.16   Mean   :1796.7   Mean   : 77.03  
##  3rd Qu.:62.50   3rd Qu.:28.85   3rd Qu.:2100.4   3rd Qu.: 95.25  
##  Max.   :83.00   Max.   :50.40   Max.   :6662.2   Max.   :235.90  
##      FIBER          ALCOHOL         CHOLESTEROL       BETADIET   
##  Min.   : 3.10   Min.   :  0.000   Min.   : 37.7   Min.   : 214  
##  1st Qu.: 9.15   1st Qu.:  0.000   1st Qu.:155.0   1st Qu.:1116  
##  Median :12.10   Median :  0.300   Median :206.3   Median :1802  
##  Mean   :12.79   Mean   :  3.279   Mean   :242.5   Mean   :2186  
##  3rd Qu.:15.60   3rd Qu.:  3.200   3rd Qu.:308.9   3rd Qu.:2836  
##  Max.   :36.80   Max.   :203.000   Max.   :900.7   Max.   :9642  
##     RETDIET         BETAPLASMA       RETPLASMA     
##  Min.   :  30.0   Min.   :   0.0   Min.   : 179.0  
##  1st Qu.: 480.0   1st Qu.:  90.0   1st Qu.: 466.0  
##  Median : 707.0   Median : 140.0   Median : 566.0  
##  Mean   : 832.7   Mean   : 189.9   Mean   : 602.8  
##  3rd Qu.:1037.0   3rd Qu.: 230.0   3rd Qu.: 716.0  
##  Max.   :6901.0   Max.   :1415.0   Max.   :1727.0

histogram

par(mfrow = c(2, 2))
for(i in 1:11){
  hist(quant_plasma[, i], main=paste("Histogram of", names(quant_plasma)[i]))
}

boxplot

par(mfrow=c(1,3))
for (i in 1:length(quant_plasma)) {
        boxplot(quant_plasma[,i], main=names(quant_plasma[i]), type="l")
}

scatter plot matrix (with correlation)

panel.cor <- function(x, y) {
    par(usr = c(0, 1, 0, 1))
    r <- round(cor(x, y, use = "complete.obs"), 2)
    txt <- paste0("R = ", r)
    cex.cor <- 0.8/strwidth(txt)
    text(0.5, 0.5, txt, cex = cex.cor * (abs(r) + 1))
}

pairs(quant_plasma, lower.panel = panel.cor)

### scatter plot matrix without extreme cases (with correlation)

rm1 <- which(quant_plasma$ALCOHOL == max(quant_plasma$ALCOHOL))
quant_plasma_rm <- quant_plasma[-rm1,]
rm2 <- which(quant_plasma_rm$RETDIET == max(quant_plasma_rm$RETDIET))
quant_plasma_rm <- quant_plasma_rm[-rm2,]
panel.cor <- function(x, y) {
    par(usr = c(0, 1, 0, 1))
    r <- round(cor(x, y, use = "complete.obs"), 2)
    txt <- paste0("R = ", r)
    cex.cor <- 0.8/strwidth(txt)
    text(0.5, 0.5, txt, cex = cex.cor * (abs(r) + 1))
}

pairs(quant_plasma_rm, lower.panel = panel.cor)

pairwise correlations

cor(quant_plasma)
##                      AGE     QUETELET     CALORIES         FAT       FIBER
## AGE          1.000000000 -0.017463821 -0.176769399 -0.16947981  0.04485175
## QUETELET    -0.017463821  1.000000000  0.003526964  0.04875033 -0.08762333
## CALORIES    -0.176769399  0.003526964  1.000000000  0.87184150  0.46548077
## FAT         -0.169479813  0.048750329  0.871841504  1.00000000  0.27648356
## FIBER        0.044851750 -0.087623326  0.465480772  0.27648356  1.00000000
## ALCOHOL      0.051583268 -0.072695431  0.451469804  0.18574310 -0.02011748
## CHOLESTEROL -0.113605966  0.110257245  0.659175452  0.70984794  0.15396838
## BETADIET     0.071869247 -0.006603005  0.243376823  0.14342785  0.48264371
## RETDIET     -0.009610795  0.032055785  0.402491661  0.41221481  0.21461162
## BETAPLASMA   0.101127650 -0.229387366 -0.022206957 -0.09164659  0.23595358
## RETPLASMA    0.211671702  0.013138652 -0.073328605 -0.09093779 -0.04443071
##                 ALCOHOL CHOLESTEROL     BETADIET      RETDIET  BETAPLASMA
## AGE          0.05158327 -0.11360597  0.071869247 -0.009610795  0.10112765
## QUETELET    -0.07269543  0.11025724 -0.006603005  0.032055785 -0.22938737
## CALORIES     0.45146980  0.65917545  0.243376823  0.402491661 -0.02220696
## FAT          0.18574310  0.70984794  0.143427853  0.412214814 -0.09164659
## FIBER       -0.02011748  0.15396838  0.482643706  0.214611616  0.23595358
## ALCOHOL      1.00000000  0.18226398  0.039426478  0.044946841 -0.02221084
## CHOLESTEROL  0.18226398  1.00000000  0.115634801  0.443439304 -0.13030501
## BETADIET     0.03942648  0.11563480  1.000000000  0.052866904  0.22477951
## RETDIET      0.04494684  0.44343930  0.052866904  1.000000000 -0.04613524
## BETAPLASMA  -0.02221084 -0.13030501  0.224779514 -0.046135240  1.00000000
## RETPLASMA    0.01713633 -0.07020134 -0.013539420 -0.062802201  0.07157724
##               RETPLASMA
## AGE          0.21167170
## QUETELET     0.01313865
## CALORIES    -0.07332861
## FAT         -0.09093779
## FIBER       -0.04443071
## ALCOHOL      0.01713633
## CHOLESTEROL -0.07020134
## BETADIET    -0.01353942
## RETDIET     -0.06280220
## BETAPLASMA   0.07157724
## RETPLASMA    1.00000000
r <- cor(quant_plasma)["BETAPLASMA","RETPLASMA"]
r
## [1] 0.07157724
n.q <- nrow(quant_plasma)
#Hypothesis testing for correlation among betaplasma and retplasma
#Null hypothesis : H0: r = 0
#Alternative hypothesis: H1: r not equal to 0
t_statistic <- (r*sqrt(n.q-2))/(sqrt(1-r^2))
t_value <- qt(1-0.05/2, n.q-2)
t_statistic > t_value
## [1] FALSE
#Discision Rule: when t_statistics > t_value = 1.967572, we reject null hypothesis.
#Conclusion: there is not enough evidence to conclude that under significance level 0.05 there is a linear relationship between BETAPLASMA and RETPLASMA
as.data.frame(cor(quant_plasma))
##                      AGE     QUETELET     CALORIES         FAT       FIBER
## AGE          1.000000000 -0.017463821 -0.176769399 -0.16947981  0.04485175
## QUETELET    -0.017463821  1.000000000  0.003526964  0.04875033 -0.08762333
## CALORIES    -0.176769399  0.003526964  1.000000000  0.87184150  0.46548077
## FAT         -0.169479813  0.048750329  0.871841504  1.00000000  0.27648356
## FIBER        0.044851750 -0.087623326  0.465480772  0.27648356  1.00000000
## ALCOHOL      0.051583268 -0.072695431  0.451469804  0.18574310 -0.02011748
## CHOLESTEROL -0.113605966  0.110257245  0.659175452  0.70984794  0.15396838
## BETADIET     0.071869247 -0.006603005  0.243376823  0.14342785  0.48264371
## RETDIET     -0.009610795  0.032055785  0.402491661  0.41221481  0.21461162
## BETAPLASMA   0.101127650 -0.229387366 -0.022206957 -0.09164659  0.23595358
## RETPLASMA    0.211671702  0.013138652 -0.073328605 -0.09093779 -0.04443071
##                 ALCOHOL CHOLESTEROL     BETADIET      RETDIET  BETAPLASMA
## AGE          0.05158327 -0.11360597  0.071869247 -0.009610795  0.10112765
## QUETELET    -0.07269543  0.11025724 -0.006603005  0.032055785 -0.22938737
## CALORIES     0.45146980  0.65917545  0.243376823  0.402491661 -0.02220696
## FAT          0.18574310  0.70984794  0.143427853  0.412214814 -0.09164659
## FIBER       -0.02011748  0.15396838  0.482643706  0.214611616  0.23595358
## ALCOHOL      1.00000000  0.18226398  0.039426478  0.044946841 -0.02221084
## CHOLESTEROL  0.18226398  1.00000000  0.115634801  0.443439304 -0.13030501
## BETADIET     0.03942648  0.11563480  1.000000000  0.052866904  0.22477951
## RETDIET      0.04494684  0.44343930  0.052866904  1.000000000 -0.04613524
## BETAPLASMA  -0.02221084 -0.13030501  0.224779514 -0.046135240  1.00000000
## RETPLASMA    0.01713633 -0.07020134 -0.013539420 -0.062802201  0.07157724
##               RETPLASMA
## AGE          0.21167170
## QUETELET     0.01313865
## CALORIES    -0.07332861
## FAT         -0.09093779
## FIBER       -0.04443071
## ALCOHOL      0.01713633
## CHOLESTEROL -0.07020134
## BETADIET    -0.01353942
## RETDIET     -0.06280220
## BETAPLASMA   0.07157724
## RETPLASMA    1.00000000

Qualitative

plasma$SEX <- as.factor(plasma$SEX)
plasma$SMOKSTAT <- as.factor(plasma$SMOKSTAT)
plasma$VITUSE <- as.factor(plasma$VITUSE)
sapply(plasma, class)
##         AGE         SEX    SMOKSTAT    QUETELET      VITUSE    CALORIES 
##   "integer"    "factor"    "factor"   "numeric"    "factor"   "numeric" 
##         FAT       FIBER     ALCOHOL CHOLESTEROL    BETADIET     RETDIET 
##   "numeric"   "numeric"   "numeric"   "numeric"   "integer"   "integer" 
##  BETAPLASMA   RETPLASMA 
##   "integer"   "integer"
levels(plasma$SEX)
## [1] "FEMALE" "MALE"
levels(plasma$SMOKSTAT)
## [1] "CURRENT" "FORMER"  "NEVER"
levels(plasma$VITUSE)
## [1] "NO"        "NOT OFTEN" "OFTEN"

barplot

par(mfrow = c(2,2))
table.sex <- table(plasma$SEX)
table.smok <- table(plasma$SMOKSTAT)
table.vit <- table(plasma$VITUSE)

barplot(table.sex, main = "Sex: bar chart")
barplot(table.smok, main = "Smokestat: bar chart")
barplot(table.vit, main = "Vituse: bar chart")

piechart

par(mfrow = c(2,2))

level.sex <- c("female", "male")
pct.sex <- round(table.sex/nrow(plasma)*100)
label.sex <- paste(paste(level.sex, pct.sex),"%")
pie(table.sex, labels = label.sex, col = c(1,2), main = "Sex: pie chart with percentage")

level.smok <- c("current", "former", "never")
pct.smok <- round(table.smok/nrow(plasma)*100)
label.smok <- paste(paste(level.smok, pct.smok),"%") 
pie(table.smok, labels = label.smok, col = c(4,5,6), main = "Smokstat: pie chart with percentage")

level.vit <- c("no", "not often", "often")
pct.vit <- round(table.vit/nrow(plasma)*100)
label.vit <- paste(paste(level.vit, pct.vit),"%")
pie(table.vit, labels = label.vit, col = c(7,8,9), main = "Vituse: pie chart with percentage")

side by side boxplot by sex, smokestatus, and vitamin use level

par(mfrow = c(2,3))
ctgr.name <- c("SEX","SMOKSTAT","VITUSE")
response.name <- c("BETAPLASMA", "RETPLASMA")

for (i in ctgr.name){
  for (j in response.name){
    m <- paste(j,":","side-by-side box plot by",i,"level")
    
    boxplot(plasma[,j]~plasma[,i], main = m, xlab = i, ylab = j)
  }
}

Betaplasma

Preliminary Model Investigation

transformation to Betaplasma

rm <- which(plasma$BETAPLASMA <= 0)  #only 1 observation
plasma_rm <- plasma[-rm,] #Boxcox procedure can only deal with positive response variable
beta_plasma_rm <- plasma_rm[,-14]

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# ?if we should put subjective model here
fit.pre <- lm(BETAPLASMA~., data = beta_plasma_rm)
summary(fit.pre)
## 
## Call:
## lm(formula = BETAPLASMA ~ ., data = beta_plasma_rm)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -257.90  -88.68  -28.05   38.53 1071.60 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     181.639774  68.252089   2.661 0.008204 ** 
## AGE               0.718567   0.748057   0.961 0.337541    
## SEXMALE         -33.852475  31.807026  -1.064 0.288048    
## SMOKSTATFORMER   41.532555  31.195470   1.331 0.184083    
## SMOKSTATNEVER    47.547121  30.976114   1.535 0.125851    
## QUETELET         -5.929232   1.632950  -3.631 0.000332 ***
## VITUSENOT OFTEN  43.916308  25.323858   1.734 0.083916 .  
## VITUSEOFTEN      79.267745  23.245584   3.410 0.000739 ***
## CALORIES         -0.030255   0.051274  -0.590 0.555600    
## FAT               0.101812   0.805645   0.126 0.899522    
## FIBER             6.378290   2.830550   2.253 0.024960 *  
## ALCOHOL           0.981143   1.242056   0.790 0.430192    
## CHOLESTEROL      -0.064525   0.113552  -0.568 0.570297    
## BETADIET          0.015840   0.007510   2.109 0.035755 *  
## RETDIET          -0.007303   0.018724  -0.390 0.696784    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 168.2 on 299 degrees of freedom
## Multiple R-squared:  0.1928, Adjusted R-squared:  0.155 
## F-statistic:   5.1 on 14 and 299 DF,  p-value: 1.363e-08
par(mfrow=c(2,2))
plot(fit.pre, which=c(1,2))
boxcox(fit.pre)#lambda = 0,indicating log transformation

hist(log(plasma$BETAPLASMA), xlab ="log(BETAPLASMA)", main = "Histogram of log(BETAPLASMA)")

fit.pre.trans <- lm(log(BETAPLASMA)~., data = beta_plasma_rm) #try log transformation on BETAPLASMA
plot(fit.pre.trans, which=c(1,2))

fit full model

set.seed(1024)
beta_plasma <- plasma[,-14]
index <- sample(1:315, ceiling(315*0.8), replace = FALSE)
beta_plasma_t <- beta_plasma[index,] #training data
beta_plasma_v <- beta_plasma[-index,] #validation data 
beta_plasma_t_full<- lm(log(BETAPLASMA+1)~., data=beta_plasma_t)
summary(beta_plasma_t_full)
## 
## Call:
## lm(formula = log(BETAPLASMA + 1) ~ ., data = beta_plasma_t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.05512 -0.36039  0.03189  0.38717  1.80650 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.215e+00  2.819e-01  18.499  < 2e-16 ***
## AGE              4.355e-03  3.170e-03   1.374  0.17076    
## SEXMALE         -3.120e-01  1.349e-01  -2.313  0.02160 *  
## SMOKSTATFORMER   1.274e-01  1.325e-01   0.962  0.33720    
## SMOKSTATNEVER    2.526e-01  1.299e-01   1.945  0.05296 .  
## QUETELET        -3.209e-02  6.864e-03  -4.675 4.95e-06 ***
## VITUSENOT OFTEN  2.611e-01  1.089e-01   2.397  0.01731 *  
## VITUSEOFTEN      2.981e-01  9.890e-02   3.015  0.00285 ** 
## CALORIES        -1.613e-04  2.158e-04  -0.747  0.45552    
## FAT             -9.856e-04  3.416e-03  -0.289  0.77322    
## FIBER            1.720e-02  1.211e-02   1.420  0.15695    
## ALCOHOL          4.347e-03  5.108e-03   0.851  0.39563    
## CHOLESTEROL     -1.227e-04  4.661e-04  -0.263  0.79255    
## BETADIET         8.864e-05  3.298e-05   2.688  0.00770 ** 
## RETDIET          6.235e-05  7.597e-05   0.821  0.41264    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6476 on 237 degrees of freedom
## Multiple R-squared:  0.2675, Adjusted R-squared:  0.2243 
## F-statistic: 6.183 on 14 and 237 DF,  p-value: 1.837e-10
par(mfrow=c(2,2))
plot(beta_plasma_t_full)

Model Selection

Model selection approach 1: best subset

First order model

library(leaps)
subset <- regsubsets(log(BETAPLASMA+1)~., data = beta_plasma_t, nvmax = 12, method ="exhaustive")
sum_sub <- summary(subset)

n.t <- nrow(beta_plasma_t)
p <- rowSums(sum_sub$which)
SSEp <- sum_sub$rss
R2 <- sum_sub$rsq
Ra2 <- sum_sub$adjr2
Cp <- sum_sub$cp
AIC <- n.t*log(SSEp/n.t)+2*p
BIC <-n.t*log(SSEp/n.t)+log(n.t)*p
criteria <- cbind(sum_sub$which, SSEp, R2, Ra2, Cp, AIC, BIC)

#add the criteria of null model
beta_plasma_t_null<- lm(log(BETAPLASMA+1)~1, data = beta_plasma_t)
sse0 <- anova(beta_plasma_t_null)["Residuals",2]
r0 <- summary(beta_plasma_t_null)$r.squared
ra0 <- summary(beta_plasma_t_null)$adj.r.squared
p0 <- 1
cp0 <- (sse0/summary(beta_plasma_t_full)$sigma^2)-(n.t-2*p0)
aic0 <- n.t*log(sse0/n.t)+2*p0
bic0 <- n.t*log(sse0/n.t)+log(n.t)*p0
null <- c(1,rep(0,14),sse0,r0,ra0,cp0,aic0,bic0)
criteria <- rbind(null, criteria)
criteria <- as.data.frame(criteria)
criteria
##      (Intercept) AGE SEXMALE SMOKSTATFORMER SMOKSTATNEVER QUETELET
## null           1   0       0              0             0        0
## 1              1   0       0              0             0        1
## 2              1   0       0              0             0        1
## 3              1   0       0              0             0        1
## 4              1   0       0              0             1        1
## 5              1   0       0              0             0        1
## 6              1   0       0              0             1        1
## 7              1   0       1              0             1        1
## 8              1   1       1              0             1        1
## 9              1   1       1              0             1        1
## 10             1   1       1              1             1        1
## 11             1   1       1              1             1        1
## 12             1   1       1              1             1        1
##      VITUSENOT OFTEN VITUSEOFTEN CALORIES FAT FIBER ALCOHOL CHOLESTEROL
## null               0           0        0   0     0       0           0
## 1                  0           0        0   0     0       0           0
## 2                  0           0        0   0     0       0           0
## 3                  0           0        0   1     0       0           0
## 4                  0           0        0   1     0       0           0
## 5                  1           1        0   1     0       0           0
## 6                  1           1        0   1     0       0           0
## 7                  1           1        0   1     0       0           0
## 8                  1           1        0   1     0       0           0
## 9                  1           1        0   1     1       0           0
## 10                 1           1        0   1     1       0           0
## 11                 1           1        1   0     1       1           0
## 12                 1           1        1   0     1       1           0
##      BETADIET RETDIET      SSEp         R2        Ra2        Cp       AIC
## null        0       0 135.68773 0.00000000 0.00000000 73.568179 -154.0064
## 1           0       0 125.38785 0.07590872 0.07221235 51.006533 -171.9004
## 2           1       0 118.43776 0.12712991 0.12011890 36.432985 -184.2705
## 3           1       0 112.92261 0.16777581 0.15770859 25.281264 -194.2870
## 4           1       0 109.57139 0.19247383 0.17939648 19.289774 -199.8789
## 5           1       0 106.70709 0.21358333 0.19759925 14.459411 -204.5540
## 6           1       0 104.21510 0.23194898 0.21313957 10.516869 -208.5090
## 7           1       0 102.77856 0.24253608 0.22080556  9.091220 -210.0068
## 8           1       0 101.21785 0.25403827 0.22947986  7.369479 -211.8628
## 9           1       0 100.34463 0.26047383 0.23297079  7.287136 -212.0463
## 10          1       0  99.96620 0.26326281 0.23269280  8.384711 -210.9984
## 11          1       0  99.70535 0.26518522 0.23150621  9.762679 -209.6569
## 12          1       1  99.45994 0.26699386 0.23019021 11.177461 -208.2779
##            BIC
## null -150.4770
## 1    -164.8415
## 2    -173.6822
## 3    -180.1693
## 4    -182.2317
## 5    -183.3775
## 6    -183.8030
## 7    -181.7714
## 8    -180.0979
## 9    -176.7520
## 10   -172.1747
## 11   -167.3037
## 12   -162.3953
which.best.sub <- data.frame(
  Ra2 = which.max(criteria$Ra2),
  Cp = which.min(criteria$Cp),
  AIC = which.min(criteria$AIC),
  BIC = which.min(criteria$BIC)
)
which.best.sub
##   Ra2 Cp AIC BIC
## 1  10 10  10   7
rbind(criteria[10,],criteria[7,])
##   (Intercept) AGE SEXMALE SMOKSTATFORMER SMOKSTATNEVER QUETELET VITUSENOT OFTEN
## 9           1   1       1              0             1        1               1
## 6           1   0       0              0             1        1               1
##   VITUSEOFTEN CALORIES FAT FIBER ALCOHOL CHOLESTEROL BETADIET RETDIET     SSEp
## 9           1        0   1     1       0           0        1       0 100.3446
## 6           1        0   1     0       0           0        1       0 104.2151
##          R2       Ra2        Cp       AIC      BIC
## 9 0.2604738 0.2329708  7.287136 -212.0463 -176.752
## 6 0.2319490 0.2131396 10.516869 -208.5090 -183.803

By adjusted r squared, Mallow’s Cp and AIC criterion, the model containing AGE, SEX, SMOKSTAT, QUETELET, VITUSE, FAT, FIBER, BETADIET is the best. By BIC criteria which prefers a smaller model than AIC criteria, the model containing SMOKSTAT, QUETELET, VITUSE, FAT, BETADIET is the best.

fit the result above

#best subset from Cp and AIC and adjusted r square
best.sub1 <- lm(log(BETAPLASMA+1)~AGE+SEX+SMOKSTAT+QUETELET+VITUSE+FAT+FIBER+BETADIET, data = beta_plasma_t)
summary(best.sub1)
## 
## Call:
## lm(formula = log(BETAPLASMA + 1) ~ AGE + SEX + SMOKSTAT + QUETELET + 
##     VITUSE + FAT + FIBER + BETADIET, data = beta_plasma_t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.99316 -0.35874  0.02324  0.39915  1.81732 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.171e+00  2.742e-01  18.862  < 2e-16 ***
## AGE              5.285e-03  3.026e-03   1.747  0.08199 .  
## SEXMALE         -3.162e-01  1.290e-01  -2.451  0.01495 *  
## SMOKSTATFORMER   1.248e-01  1.307e-01   0.955  0.34046    
## SMOKSTATNEVER    2.537e-01  1.279e-01   1.983  0.04846 *  
## QUETELET        -3.286e-02  6.756e-03  -4.863 2.08e-06 ***
## VITUSENOT OFTEN  2.477e-01  1.071e-01   2.313  0.02159 *  
## VITUSEOFTEN      2.807e-01  9.676e-02   2.901  0.00406 ** 
## FAT             -3.135e-03  1.309e-03  -2.395  0.01739 *  
## FIBER            1.280e-02  9.614e-03   1.331  0.18437    
## BETADIET         8.782e-05  3.247e-05   2.704  0.00733 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.644 on 241 degrees of freedom
## Multiple R-squared:  0.2633, Adjusted R-squared:  0.2327 
## F-statistic: 8.612 on 10 and 241 DF,  p-value: 5.164e-12
par(mfrow=c(2,2))
plot(best.sub1)

#best subset from BIC
best.sub2 <- lm(log(BETAPLASMA+1)~SMOKSTAT+QUETELET+VITUSE+FAT+BETADIET, data = beta_plasma_t)
summary(best.sub2)
## 
## Call:
## lm(formula = log(BETAPLASMA + 1) ~ SMOKSTAT + QUETELET + VITUSE + 
##     FAT + BETADIET, data = beta_plasma_t)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0236 -0.3646  0.0149  0.4152  1.7755 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.4894529  0.2229717  24.620  < 2e-16 ***
## SMOKSTATFORMER   0.1688395  0.1295208   1.304 0.193609    
## SMOKSTATNEVER    0.3262964  0.1258846   2.592 0.010117 *  
## QUETELET        -0.0336699  0.0067838  -4.963  1.3e-06 ***
## VITUSENOT OFTEN  0.2669619  0.1058477   2.522 0.012302 *  
## VITUSEOFTEN      0.3145361  0.0969823   3.243 0.001347 ** 
## FAT             -0.0034961  0.0012253  -2.853 0.004700 ** 
## BETADIET         0.0001100  0.0000288   3.821 0.000169 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6513 on 244 degrees of freedom
## Multiple R-squared:  0.2373, Adjusted R-squared:  0.2154 
## F-statistic: 10.84 on 7 and 244 DF,  p-value: 6.703e-12
par(mfrow=c(2,2))
plot(best.sub2)

Model selection approach 2: Stepwise model selection

First order model

library(MASS)
step.sub1 <- stepAIC(beta_plasma_t_null, scope = list(upper=beta_plasma_t_full, lower=~1), direction = "both", k = 2, trace = FALSE)

step.model1 <- lm(log(BETAPLASMA+1) ~ QUETELET + VITUSE + BETADIET + 
    FAT + SMOKSTAT + SEX + AGE, data = beta_plasma_t)
summary(step.model1) 
## 
## Call:
## lm(formula = log(BETAPLASMA + 1) ~ QUETELET + VITUSE + BETADIET + 
##     FAT + SMOKSTAT + SEX + AGE, data = beta_plasma_t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.95934 -0.34445  0.03701  0.39190  1.76567 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.254e+00  2.675e-01  19.639  < 2e-16 ***
## QUETELET        -3.391e-02  6.720e-03  -5.046 8.85e-07 ***
## VITUSENOT OFTEN  2.542e-01  1.072e-01   2.372 0.018497 *  
## VITUSEOFTEN      2.866e-01  9.682e-02   2.960 0.003382 ** 
## BETADIET         1.085e-04  2.855e-05   3.802 0.000182 ***
## FAT             -2.656e-03  1.260e-03  -2.107 0.036150 *  
## SMOKSTATFORMER   1.449e-01  1.300e-01   1.114 0.266318    
## SMOKSTATNEVER    2.805e-01  1.265e-01   2.217 0.027533 *  
## SEXMALE         -3.072e-01  1.290e-01  -2.381 0.018028 *  
## AGE              5.266e-03  3.031e-03   1.737 0.083572 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6451 on 242 degrees of freedom
## Multiple R-squared:  0.2578, Adjusted R-squared:  0.2302 
## F-statistic: 9.342 on 9 and 242 DF,  p-value: 3.615e-12

Second order model

beta_plasma_t_full2 <- lm(log(BETAPLASMA+1)~.^2, data = beta_plasma_t)
step.sub2 <- stepAIC(beta_plasma_t_null, scope = list(upper=beta_plasma_t_full2, lower=~1), direction = "both", k = 2, trace = FALSE)

step.model2 <- lm(log(BETAPLASMA +1) ~ CHOLESTEROL + FIBER + QUETELET + VITUSE + BETADIET + AGE + RETDIET + SMOKSTAT + VITUSE:BETADIET + CHOLESTEROL:RETDIET + AGE:RETDIET + VITUSE:SMOKSTAT + RETDIET:SMOKSTAT + CHOLESTEROL:QUETELET + BETADIET:AGE + FIBER:SMOKSTAT + CHOLESTEROL:FIBER, data = beta_plasma_t)

summary(step.model2) 
## 
## Call:
## lm(formula = log(BETAPLASMA + 1) ~ CHOLESTEROL + FIBER + QUETELET + 
##     VITUSE + BETADIET + AGE + RETDIET + SMOKSTAT + VITUSE:BETADIET + 
##     CHOLESTEROL:RETDIET + AGE:RETDIET + VITUSE:SMOKSTAT + RETDIET:SMOKSTAT + 
##     CHOLESTEROL:QUETELET + BETADIET:AGE + FIBER:SMOKSTAT + CHOLESTEROL:FIBER, 
##     data = beta_plasma_t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.97244 -0.33469  0.01207  0.36536  1.66268 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     6.133e+00  5.947e-01  10.313  < 2e-16 ***
## CHOLESTEROL                    -2.140e-03  1.496e-03  -1.431  0.15395    
## FIBER                          -2.775e-02  2.923e-02  -0.949  0.34340    
## QUETELET                       -4.421e-02  1.466e-02  -3.016  0.00286 ** 
## VITUSENOT OFTEN                -6.311e-02  2.944e-01  -0.214  0.83045    
## VITUSEOFTEN                    -5.191e-01  2.979e-01  -1.743  0.08277 .  
## BETADIET                        1.576e-04  1.351e-04   1.167  0.24460    
## AGE                             2.126e-03  6.866e-03   0.310  0.75716    
## RETDIET                        -2.040e-04  3.569e-04  -0.572  0.56811    
## SMOKSTATFORMER                 -3.972e-01  3.918e-01  -1.014  0.31169    
## SMOKSTATNEVER                  -3.612e-01  3.663e-01  -0.986  0.32520    
## VITUSENOT OFTEN:BETADIET        2.226e-05  8.075e-05   0.276  0.78303    
## VITUSEOFTEN:BETADIET            1.525e-04  7.083e-05   2.153  0.03236 *  
## CHOLESTEROL:RETDIET            -1.248e-07  4.243e-07  -0.294  0.76888    
## AGE:RETDIET                     8.134e-06  6.120e-06   1.329  0.18517    
## VITUSENOT OFTEN:SMOKSTATFORMER  6.574e-01  3.312e-01   1.985  0.04836 *  
## VITUSEOFTEN:SMOKSTATFORMER      7.090e-01  3.190e-01   2.222  0.02725 *  
## VITUSENOT OFTEN:SMOKSTATNEVER   1.457e-01  3.269e-01   0.446  0.65626    
## VITUSEOFTEN:SMOKSTATNEVER       4.893e-01  3.062e-01   1.598  0.11144    
## RETDIET:SMOKSTATFORMER         -1.812e-04  2.971e-04  -0.610  0.54265    
## RETDIET:SMOKSTATNEVER          -1.026e-04  2.672e-04  -0.384  0.70122    
## CHOLESTEROL:QUETELET            6.264e-05  5.109e-05   1.226  0.22142    
## BETADIET:AGE                   -2.800e-06  2.390e-06  -1.171  0.24272    
## FIBER:SMOKSTATFORMER            3.364e-02  2.828e-02   1.189  0.23551    
## FIBER:SMOKSTATNEVER             5.296e-02  2.668e-02   1.985  0.04837 *  
## CHOLESTEROL:FIBER              -3.074e-05  6.912e-05  -0.445  0.65695    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.642 on 226 degrees of freedom
## Multiple R-squared:  0.3136, Adjusted R-squared:  0.2376 
## F-statistic: 4.129 on 25 and 226 DF,  p-value: 3.895e-09

Model Validation

Internal validation of step.model1 and step.model2

#find SSE for step.model1 and step.model2
sse.bs1 <- anova(best.sub1)["Residuals",2]
sse.bs2 <- anova(best.sub2)["Residuals",2]
sse.sm1 <- anova(step.model1)["Residuals",2]
sse.sm2 <- anova(step.model2)["Residuals",2]
sse.compare <- c(sse.bs1,sse.bs2,sse.sm1,sse.sm2)

#find MSE for step.model1 and step.model2
mse.bs1 <- anova(best.sub1)["Residuals",3]
mse.bs2 <- anova(best.sub2)["Residuals",3]
mse.sm1 <- anova(step.model1)["Residuals",3]
mse.sm2 <- anova(step.model2)["Residuals",3]
mse.compare <- c(mse.bs1,mse.bs2,mse.sm1,mse.sm2)

#find p for step.model1 and step.model2
p.bs1 <-9
p.bs2 <-6
p.sm1 <-7
p.sm2 <-17
p.compare <- c(p.bs1,p.bs2,p.sm1,p.sm2)

#find Cp for step.model1 and step.model2
#sigma^2: MSE_fullmodel(create a fit3 related to step.model1)
#n.t: nrow(training data)

beta_plasma_tt2 <- beta_plasma_t[,c("BETAPLASMA","AGE","SEX","SMOKSTAT","QUETELET","VITUSE","FAT","FIBER","BETADIET")]
Full4 <-  lm(log(BETAPLASMA+1)~.^2, data=beta_plasma_tt2)
mse4 <- anova(Full4)['Residuals',3]

beta_plasma_tt <- beta_plasma_t[,c("BETAPLASMA","QUETELET","VITUSE","CHOLESTEROL","BETADIET","SMOKSTAT","SEX")]
Full3 <- lm(log(BETAPLASMA+1)~.^2, data=beta_plasma_tt)
length(Full3$coef)
## [1] 35
mse3 <- anova(Full3)['Residuals',3]

cp.bs1 <- (sse.bs1/mse4)-(n.t-2*p.bs1)
cp.bs2 <- (sse.bs2/mse4)-(n.t-2*p.bs2)
cp.sm1 <- (sse.sm1/mse3)-(n.t-2*p.sm1)
cp.sm2 <- (sse.sm2/mse3)-(n.t-2*p.sm2)
cp.compare <- c(cp.bs1,cp.bs2,cp.sm1,cp.sm2)

#find Pressp for step.model1 and step.model2
press.bs1 <- sum(best.sub1$residuals^2/(1-influence(best.sub1)$hat)^2)
press.bs2 <- sum(best.sub2$residuals^2/(1-influence(best.sub2)$hat)^2)
press.sm1 <- sum(step.model1$residuals^2/(1-influence(step.model1)$hat)^2)
press.sm2 <- sum(step.model2$residuals^2/(1-influence(step.model2)$hat)^2)
press.compare <- c(press.bs1,press.bs2,press.sm1,press.sm2)

compare <- data.frame(sse=sse.compare, mse=mse.compare, p=p.compare, cp=cp.compare, press=press.compare)
rownames(compare) <- c("best subset 1","best subset 2","stepwise mode1", "stepwise model2")
compare
##                       sse       mse  p        cp    press
## best subset 1    99.96620 0.4147975  9 12.972734 109.5064
## best subset 2   103.49433 0.4241571  6 15.689211 110.4410
## stepwise mode1  100.70131 0.4161211  7  9.511715 109.3472
## stepwise model2  93.14226 0.4121339 17 10.932479 132.0332

External validation

##best subsets, stepwise models on validation data
best.sub1.v <- lm(best.sub1, data = beta_plasma_v)
best.sub2.v <- lm(best.sub2, data = beta_plasma_v)
step.model1.v <- lm(step.model1, data = beta_plasma_v)
step.model2.v <- lm(step.model2, data = beta_plasma_v)

#summary on training data and validation data respectively
list(summary(best.sub1), summary(best.sub2),summary(step.model1),summary(step.model2))
## [[1]]
## 
## Call:
## lm(formula = log(BETAPLASMA + 1) ~ AGE + SEX + SMOKSTAT + QUETELET + 
##     VITUSE + FAT + FIBER + BETADIET, data = beta_plasma_t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.99316 -0.35874  0.02324  0.39915  1.81732 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.171e+00  2.742e-01  18.862  < 2e-16 ***
## AGE              5.285e-03  3.026e-03   1.747  0.08199 .  
## SEXMALE         -3.162e-01  1.290e-01  -2.451  0.01495 *  
## SMOKSTATFORMER   1.248e-01  1.307e-01   0.955  0.34046    
## SMOKSTATNEVER    2.537e-01  1.279e-01   1.983  0.04846 *  
## QUETELET        -3.286e-02  6.756e-03  -4.863 2.08e-06 ***
## VITUSENOT OFTEN  2.477e-01  1.071e-01   2.313  0.02159 *  
## VITUSEOFTEN      2.807e-01  9.676e-02   2.901  0.00406 ** 
## FAT             -3.135e-03  1.309e-03  -2.395  0.01739 *  
## FIBER            1.280e-02  9.614e-03   1.331  0.18437    
## BETADIET         8.782e-05  3.247e-05   2.704  0.00733 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.644 on 241 degrees of freedom
## Multiple R-squared:  0.2633, Adjusted R-squared:  0.2327 
## F-statistic: 8.612 on 10 and 241 DF,  p-value: 5.164e-12
## 
## 
## [[2]]
## 
## Call:
## lm(formula = log(BETAPLASMA + 1) ~ SMOKSTAT + QUETELET + VITUSE + 
##     FAT + BETADIET, data = beta_plasma_t)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0236 -0.3646  0.0149  0.4152  1.7755 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.4894529  0.2229717  24.620  < 2e-16 ***
## SMOKSTATFORMER   0.1688395  0.1295208   1.304 0.193609    
## SMOKSTATNEVER    0.3262964  0.1258846   2.592 0.010117 *  
## QUETELET        -0.0336699  0.0067838  -4.963  1.3e-06 ***
## VITUSENOT OFTEN  0.2669619  0.1058477   2.522 0.012302 *  
## VITUSEOFTEN      0.3145361  0.0969823   3.243 0.001347 ** 
## FAT             -0.0034961  0.0012253  -2.853 0.004700 ** 
## BETADIET         0.0001100  0.0000288   3.821 0.000169 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6513 on 244 degrees of freedom
## Multiple R-squared:  0.2373, Adjusted R-squared:  0.2154 
## F-statistic: 10.84 on 7 and 244 DF,  p-value: 6.703e-12
## 
## 
## [[3]]
## 
## Call:
## lm(formula = log(BETAPLASMA + 1) ~ QUETELET + VITUSE + BETADIET + 
##     FAT + SMOKSTAT + SEX + AGE, data = beta_plasma_t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.95934 -0.34445  0.03701  0.39190  1.76567 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.254e+00  2.675e-01  19.639  < 2e-16 ***
## QUETELET        -3.391e-02  6.720e-03  -5.046 8.85e-07 ***
## VITUSENOT OFTEN  2.542e-01  1.072e-01   2.372 0.018497 *  
## VITUSEOFTEN      2.866e-01  9.682e-02   2.960 0.003382 ** 
## BETADIET         1.085e-04  2.855e-05   3.802 0.000182 ***
## FAT             -2.656e-03  1.260e-03  -2.107 0.036150 *  
## SMOKSTATFORMER   1.449e-01  1.300e-01   1.114 0.266318    
## SMOKSTATNEVER    2.805e-01  1.265e-01   2.217 0.027533 *  
## SEXMALE         -3.072e-01  1.290e-01  -2.381 0.018028 *  
## AGE              5.266e-03  3.031e-03   1.737 0.083572 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6451 on 242 degrees of freedom
## Multiple R-squared:  0.2578, Adjusted R-squared:  0.2302 
## F-statistic: 9.342 on 9 and 242 DF,  p-value: 3.615e-12
## 
## 
## [[4]]
## 
## Call:
## lm(formula = log(BETAPLASMA + 1) ~ CHOLESTEROL + FIBER + QUETELET + 
##     VITUSE + BETADIET + AGE + RETDIET + SMOKSTAT + VITUSE:BETADIET + 
##     CHOLESTEROL:RETDIET + AGE:RETDIET + VITUSE:SMOKSTAT + RETDIET:SMOKSTAT + 
##     CHOLESTEROL:QUETELET + BETADIET:AGE + FIBER:SMOKSTAT + CHOLESTEROL:FIBER, 
##     data = beta_plasma_t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.97244 -0.33469  0.01207  0.36536  1.66268 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     6.133e+00  5.947e-01  10.313  < 2e-16 ***
## CHOLESTEROL                    -2.140e-03  1.496e-03  -1.431  0.15395    
## FIBER                          -2.775e-02  2.923e-02  -0.949  0.34340    
## QUETELET                       -4.421e-02  1.466e-02  -3.016  0.00286 ** 
## VITUSENOT OFTEN                -6.311e-02  2.944e-01  -0.214  0.83045    
## VITUSEOFTEN                    -5.191e-01  2.979e-01  -1.743  0.08277 .  
## BETADIET                        1.576e-04  1.351e-04   1.167  0.24460    
## AGE                             2.126e-03  6.866e-03   0.310  0.75716    
## RETDIET                        -2.040e-04  3.569e-04  -0.572  0.56811    
## SMOKSTATFORMER                 -3.972e-01  3.918e-01  -1.014  0.31169    
## SMOKSTATNEVER                  -3.612e-01  3.663e-01  -0.986  0.32520    
## VITUSENOT OFTEN:BETADIET        2.226e-05  8.075e-05   0.276  0.78303    
## VITUSEOFTEN:BETADIET            1.525e-04  7.083e-05   2.153  0.03236 *  
## CHOLESTEROL:RETDIET            -1.248e-07  4.243e-07  -0.294  0.76888    
## AGE:RETDIET                     8.134e-06  6.120e-06   1.329  0.18517    
## VITUSENOT OFTEN:SMOKSTATFORMER  6.574e-01  3.312e-01   1.985  0.04836 *  
## VITUSEOFTEN:SMOKSTATFORMER      7.090e-01  3.190e-01   2.222  0.02725 *  
## VITUSENOT OFTEN:SMOKSTATNEVER   1.457e-01  3.269e-01   0.446  0.65626    
## VITUSEOFTEN:SMOKSTATNEVER       4.893e-01  3.062e-01   1.598  0.11144    
## RETDIET:SMOKSTATFORMER         -1.812e-04  2.971e-04  -0.610  0.54265    
## RETDIET:SMOKSTATNEVER          -1.026e-04  2.672e-04  -0.384  0.70122    
## CHOLESTEROL:QUETELET            6.264e-05  5.109e-05   1.226  0.22142    
## BETADIET:AGE                   -2.800e-06  2.390e-06  -1.171  0.24272    
## FIBER:SMOKSTATFORMER            3.364e-02  2.828e-02   1.189  0.23551    
## FIBER:SMOKSTATNEVER             5.296e-02  2.668e-02   1.985  0.04837 *  
## CHOLESTEROL:FIBER              -3.074e-05  6.912e-05  -0.445  0.65695    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.642 on 226 degrees of freedom
## Multiple R-squared:  0.3136, Adjusted R-squared:  0.2376 
## F-statistic: 4.129 on 25 and 226 DF,  p-value: 3.895e-09
list(summary(best.sub1.v), summary(best.sub2.v),summary(step.model1.v),summary(step.model2.v))
## [[1]]
## 
## Call:
## lm(formula = best.sub1, data = beta_plasma_v)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6792 -0.3864  0.0216  0.3397  1.8382 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.568e+00  1.001e+00   4.566 3.08e-05 ***
## AGE              5.231e-03  1.002e-02   0.522  0.60370    
## SEXMALE          6.092e-01  4.202e-01   1.450  0.15314    
## SMOKSTATFORMER   6.603e-01  4.100e-01   1.610  0.11336    
## SMOKSTATNEVER    3.660e-01  4.149e-01   0.882  0.38186    
## QUETELET        -3.833e-02  2.212e-02  -1.733  0.08904 .  
## VITUSENOT OFTEN  5.585e-01  3.373e-01   1.656  0.10382    
## VITUSEOFTEN      2.727e-01  3.081e-01   0.885  0.38018    
## FAT             -8.900e-03  4.291e-03  -2.074  0.04303 *  
## FIBER            6.452e-02  2.273e-02   2.839  0.00644 ** 
## BETADIET         6.002e-06  8.446e-05   0.071  0.94362    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8669 on 52 degrees of freedom
## Multiple R-squared:  0.3461, Adjusted R-squared:  0.2204 
## F-statistic: 2.753 on 10 and 52 DF,  p-value: 0.008433
## 
## 
## [[2]]
## 
## Call:
## lm(formula = best.sub2, data = beta_plasma_v)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0121 -0.4015  0.0569  0.4081  1.8483 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.604e+00  8.618e-01   6.502 2.46e-08 ***
## SMOKSTATFORMER   7.871e-01  4.376e-01   1.799   0.0776 .  
## SMOKSTATNEVER    6.101e-01  4.373e-01   1.395   0.1685    
## QUETELET        -5.222e-02  2.307e-02  -2.264   0.0276 *  
## VITUSENOT OFTEN  3.463e-01  3.541e-01   0.978   0.3323    
## VITUSEOFTEN      1.479e-01  3.267e-01   0.453   0.6526    
## FAT             -4.788e-03  3.951e-03  -1.212   0.2308    
## BETADIET         7.305e-05  8.459e-05   0.864   0.3915    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.934 on 55 degrees of freedom
## Multiple R-squared:  0.1972, Adjusted R-squared:  0.09503 
## F-statistic:  1.93 on 7 and 55 DF,  p-value: 0.08212
## 
## 
## [[3]]
## 
## Call:
## lm(formula = step.model1, data = beta_plasma_v)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8359 -0.3859 -0.0263  0.4063  1.8168 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.911e+00  1.057e+00   4.645 2.29e-05 ***
## QUETELET        -4.886e-02  2.321e-02  -2.105   0.0400 *  
## VITUSENOT OFTEN  4.585e-01  3.571e-01   1.284   0.2048    
## VITUSEOFTEN      1.977e-01  3.267e-01   0.605   0.5476    
## BETADIET         6.001e-05  8.760e-05   0.685   0.4963    
## FAT             -5.466e-03  4.382e-03  -1.247   0.2178    
## SMOKSTATFORMER   7.954e-01  4.335e-01   1.835   0.0721 .  
## SMOKSTATNEVER    6.085e-01  4.323e-01   1.408   0.1651    
## SEXMALE          3.904e-01  4.397e-01   0.888   0.3787    
## AGE              1.223e-02  1.033e-02   1.183   0.2421    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9228 on 53 degrees of freedom
## Multiple R-squared:  0.2448, Adjusted R-squared:  0.1165 
## F-statistic: 1.909 on 9 and 53 DF,  p-value: 0.07065
## 
## 
## [[4]]
## 
## Call:
## lm(formula = step.model2, data = beta_plasma_v)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.84211 -0.30962 -0.03657  0.24799  1.28694 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     5.201e+00  5.805e+00   0.896    0.376    
## CHOLESTEROL                    -5.417e-03  6.561e-03  -0.826    0.414    
## FIBER                          -2.267e-01  4.151e-01  -0.546    0.588    
## QUETELET                       -4.983e-02  3.671e-02  -1.358    0.183    
## VITUSENOT OFTEN                 1.276e+00  1.252e+00   1.019    0.315    
## VITUSEOFTEN                     4.022e-01  1.890e+00   0.213    0.833    
## BETADIET                        9.551e-05  3.281e-04   0.291    0.773    
## AGE                             1.826e-02  2.356e-02   0.775    0.443    
## RETDIET                         2.963e-03  1.868e-03   1.586    0.121    
## SMOKSTATFORMER                  4.174e-01  6.238e+00   0.067    0.947    
## SMOKSTATNEVER                  -4.727e-01  6.188e+00  -0.076    0.940    
## VITUSENOT OFTEN:BETADIET       -2.327e-04  2.004e-04  -1.161    0.253    
## VITUSEOFTEN:BETADIET           -1.060e-04  1.562e-04  -0.679    0.502    
## CHOLESTEROL:RETDIET            -4.268e-06  9.081e-07  -4.701 3.54e-05 ***
## AGE:RETDIET                    -1.716e-05  2.274e-05  -0.755    0.455    
## VITUSENOT OFTEN:SMOKSTATFORMER -3.704e-01  1.275e+00  -0.291    0.773    
## VITUSEOFTEN:SMOKSTATFORMER      3.954e-01  1.936e+00   0.204    0.839    
## VITUSENOT OFTEN:SMOKSTATNEVER  -9.741e-01  1.304e+00  -0.747    0.460    
## VITUSEOFTEN:SMOKSTATNEVER      -2.921e-01  1.920e+00  -0.152    0.880    
## RETDIET:SMOKSTATFORMER         -1.644e-03  1.413e-03  -1.164    0.252    
## RETDIET:SMOKSTATNEVER          -1.064e-03  1.448e-03  -0.734    0.467    
## CHOLESTEROL:QUETELET            1.045e-04  1.677e-04   0.623    0.537    
## BETADIET:AGE                   -1.036e-06  6.708e-06  -0.155    0.878    
## FIBER:SMOKSTATFORMER            1.445e-01  4.223e-01   0.342    0.734    
## FIBER:SMOKSTATNEVER             2.075e-01  4.147e-01   0.500    0.620    
## CHOLESTEROL:FIBER               4.882e-04  3.086e-04   1.582    0.122    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6017 on 37 degrees of freedom
## Multiple R-squared:  0.7758, Adjusted R-squared:  0.6244 
## F-statistic: 5.122 on 25 and 37 DF,  p-value: 4.377e-06
#percent change in parameter estimation
pct_chg.coef <- function(model, modelv, digit){
  coef.m <- coef(model)
  coef.v <- coef(modelv)
  pct.chg.coef <- round(abs(coef.v-coef.m)/abs(coef.m)*100, digit)
  pct.chg.coef
}
pct_chg.coef.bs1 <- pct_chg.coef(best.sub1, best.sub1.v, 2)
pct_chg.coef.bs2 <- pct_chg.coef(best.sub2, best.sub2.v, 2)
pct_chg.coef.sm1 <- pct_chg.coef(step.model1, step.model1.v, 2)
pct_chg.coef.sm2 <- pct_chg.coef(step.model2, step.model2.v, 2)

#percent change in standard error
pct_chg.sd <- function(model, modelv, digit){
  sd.m <- summary(model)$coefficients[, "Std. Error"]
  sd.v <- summary(modelv)$coefficients[, "Std. Error"]
  pct.chg.sd <- round(abs(sd.v-sd.m)/abs(sd.m)*100, digit)
  pct.chg.sd
}
pct_chg.sd.bs1 <- pct_chg.sd(best.sub1, best.sub1.v, 2)
pct_chg.sd.bs2 <- pct_chg.sd(best.sub2, best.sub2.v, 2)
pct_chg.sd.sm1 <- pct_chg.sd(step.model1, step.model1.v, 2)
pct_chg.sd.sm2 <- pct_chg.sd(step.model2, step.model2.v, 2)


pct_chg_beta_bs1 <- data.frame(pct_chg.coef.bs1,pct_chg.sd.bs1) 
pct_chg_beta_bs1 
##                 pct_chg.coef.bs1 pct_chg.sd.bs1
## (Intercept)                11.66         264.96
## AGE                         1.02         231.02
## SEXMALE                   292.67         225.76
## SMOKSTATFORMER            428.87         213.68
## SMOKSTATNEVER              44.25         224.40
## QUETELET                   16.65         227.37
## VITUSENOT OFTEN           125.46         214.91
## VITUSEOFTEN                 2.88         218.36
## FAT                       183.93         227.83
## FIBER                     404.10         136.38
## BETADIET                   93.17         160.10
pct_chg_beta_bs2 <- data.frame(pct_chg.coef.bs2,pct_chg.sd.bs2)
pct_chg_beta_bs2
##                 pct_chg.coef.bs2 pct_chg.sd.bs2
## (Intercept)                 2.08         286.53
## SMOKSTATFORMER            366.16         237.85
## SMOKSTATNEVER              86.99         247.35
## QUETELET                   55.08         240.05
## VITUSENOT OFTEN            29.73         234.51
## VITUSEOFTEN                52.99         236.89
## FAT                        36.94         222.44
## BETADIET                   33.61         193.73
pct_chg_beta_sm1 <- data.frame(pct_chg.coef.sm1,pct_chg.sd.sm1)
pct_chg_beta_sm1
##                 pct_chg.coef.sm1 pct_chg.sd.sm1
## (Intercept)                 6.52         295.25
## QUETELET                   44.09         245.38
## VITUSENOT OFTEN            80.38         233.21
## VITUSEOFTEN                31.00         237.46
## BETADIET                   44.70         206.85
## FAT                       105.81         247.69
## SMOKSTATFORMER            448.97         233.35
## SMOKSTATNEVER             116.91         241.66
## SEXMALE                   227.06         240.82
## AGE                       132.16         241.01
pct_chg_beta_sm2 <- data.frame(pct_chg.coef.sm2,pct_chg.sd.sm2)
pct_chg_beta_sm2
##                                pct_chg.coef.sm2 pct_chg.sd.sm2
## (Intercept)                               15.19         876.07
## CHOLESTEROL                              153.15         338.65
## FIBER                                    716.81        1319.89
## QUETELET                                  12.73         150.40
## VITUSENOT OFTEN                         2121.14         325.17
## VITUSEOFTEN                              177.48         534.40
## BETADIET                                  39.39         142.91
## AGE                                      759.02         243.14
## RETDIET                                 1552.21         423.38
## SMOKSTATFORMER                           205.08        1492.22
## SMOKSTATNEVER                             30.88        1589.37
## VITUSENOT OFTEN:BETADIET                1145.15         148.24
## VITUSEOFTEN:BETADIET                     169.51         120.54
## CHOLESTEROL:RETDIET                     3319.38         114.00
## AGE:RETDIET                              310.93         271.53
## VITUSENOT OFTEN:SMOKSTATFORMER           156.35         284.97
## VITUSEOFTEN:SMOKSTATFORMER                44.23         506.82
## VITUSENOT OFTEN:SMOKSTATNEVER            768.50         298.94
## VITUSEOFTEN:SMOKSTATNEVER                159.69         526.88
## RETDIET:SMOKSTATFORMER                   807.22         375.39
## RETDIET:SMOKSTATNEVER                    936.15         442.05
## CHOLESTEROL:QUETELET                      66.86         228.24
## BETADIET:AGE                              62.98         180.64
## FIBER:SMOKSTATFORMER                     329.39        1393.12
## FIBER:SMOKSTATNEVER                      291.79        1454.28
## CHOLESTEROL:FIBER                       1688.22         346.50
colnames(pct_chg_beta_bs1) <- c("change in coefficient(%)","change in standard deviation(%)" )
colnames(pct_chg_beta_bs2) <- c("change in coefficient(%)","change in standard deviation(%)" )
colnames(pct_chg_beta_sm1) <- c("change in coefficient(%)","change in standard deviation(%)" )
colnames(pct_chg_beta_sm2) <- c("change in coefficient(%)","change in standard deviation(%)" )

#mean squared prediction error
mspe <- function(model, dv, data){
  yhat <- predict(model, newdata = data)
  y <- data[[dv]]
  mean((y-yhat)^2)
}

mspe.bs1 <- mspe(best.sub1, "BETAPLASMA", beta_plasma_v)
mspe.bs2 <- mspe(best.sub2, "BETAPLASMA", beta_plasma_v)
mspe.sm1 <- mspe(step.model1, "BETAPLASMA", beta_plasma_v)
mspe.sm2 <- mspe(step.model2, "BETAPLASMA", beta_plasma_v)

#compare with Pressp/n and SSEp/n 
best.sub1.compare <- c(mspe.bs1,press.bs1/n.t,mse.bs1)
best.sub2.compare <- c(mspe.bs2,press.bs2/n.t,mse.bs2)
step.model1.compare <- c(mspe.sm1,press.sm1/n.t,mse.sm1)
step.model2.compare <- c(mspe.sm2,press.sm2/n.t,mse.sm2)
compare2 <-data.frame(best.sub1.compare, best.sub2.compare, step.model1.compare,step.model2.compare)
compare2 <- as.data.frame(t(compare2))
colnames(compare2) <- c("MSPE", "Press/n", "MSE")
rownames(compare2) <- c("Best Subset1", "Best Subset2", "Stepwise model1","Stepwise model2")
compare2
##                     MSPE   Press/n       MSE
## Best Subset1    58248.69 0.4345493 0.4147975
## Best Subset2    58256.81 0.4382581 0.4241571
## Stepwise model1 58260.92 0.4339174 0.4161211
## Stepwise model2 58228.58 0.5239414 0.4121339

Model diagnostic: Outlying and influential cases

best.sub1_final <- lm(best.sub1, data = beta_plasma)
summary(best.sub1_final)
## 
## Call:
## lm(formula = best.sub1, data = beta_plasma)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5102 -0.3739  0.0314  0.3991  1.9367 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.061e+00  2.783e-01  18.185  < 2e-16 ***
## AGE              6.268e-03  3.007e-03   2.084  0.03795 *  
## SEXMALE         -2.097e-01  1.289e-01  -1.627  0.10474    
## SMOKSTATFORMER   1.996e-01  1.299e-01   1.537  0.12543    
## SMOKSTATNEVER    2.493e-01  1.287e-01   1.938  0.05357 .  
## QUETELET        -3.405e-02  6.768e-03  -5.032 8.33e-07 ***
## VITUSENOT OFTEN  2.581e-01  1.051e-01   2.455  0.01464 *  
## VITUSEOFTEN      2.366e-01  9.538e-02   2.480  0.01367 *  
## FAT             -3.733e-03  1.307e-03  -2.857  0.00457 ** 
## FIBER            2.309e-02  8.984e-03   2.570  0.01065 *  
## BETADIET         6.059e-05  3.109e-05   1.949  0.05224 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7046 on 304 degrees of freedom
## Multiple R-squared:  0.2309, Adjusted R-squared:  0.2056 
## F-statistic: 9.127 on 10 and 304 DF,  p-value: 3.437e-13
anova(best.sub1_final)
## Analysis of Variance Table
## 
## Response: log(BETAPLASMA + 1)
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## AGE         1   3.915  3.9152  7.8853 0.0053062 ** 
## SEX         1   5.320  5.3202 10.7149 0.0011853 ** 
## SMOKSTAT    2   3.583  1.7914  3.6079 0.0282756 *  
## QUETELET    1  16.674 16.6740 33.5816 1.708e-08 ***
## VITUSE      2   5.184  2.5918  5.2200 0.0059027 ** 
## FAT         1   1.422  1.4217  2.8632 0.0916496 .  
## FIBER       1   7.336  7.3362 14.7751 0.0001475 ***
## BETADIET    1   1.886  1.8858  3.7979 0.0522356 .  
## Residuals 304 150.942  0.4965                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(1,1))
plot(best.sub1_final, which=c(1,2))

#check outliers in Y
#n <- nrow(totaldata)
n <- nrow(beta_plasma)
std.d.res <- studres(best.sub1_final)
p.res <- 17
bon_thred <- qt(1-0.1/(2*n), n-p.res)
outlier.Y <- as.vector(which(abs(std.d.res) >= bon_thred))
outlier.Y
## [1] 257
hii <- influence(best.sub1_final)$hat
#check outliers in X
outlier.X <- as.vector(which(hii>(2*p.res/n)))
outlier.X
## [1] 152 225
plot(hii, residuals(best.sub1_final), xlab = "leverage", ylab="residuals")

#influence index plot 
plot(best.sub1_final, which = 4)

best.sub1_final2 <- lm(best.sub1_final, data = beta_plasma[-c(35,39,257),])
fitted.sm2 <- fitted(best.sub1_final)
fitted.sm2.rm <- fitted(best.sub1_final2)
SUM <- sum(abs((fitted.sm2[-c(35,39,257)]-fitted.sm2.rm)/fitted.sm2[-c(35,39,257)]))
SUM <- SUM +abs((fitted.sm2[c(35,39,257)]-predict(step.model2, newdata = beta_plasma[c(35,39,257),]))/fitted.sm2[c(35,39,257)])
per.average <- SUM/n
per.average
##          35          39         257 
## 0.010107249 0.009746335 0.009944555
summary(best.sub1_final2)
## 
## Call:
## lm(formula = best.sub1_final, data = beta_plasma[-c(35, 39, 257), 
##     ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.96774 -0.37675  0.00819  0.39383  1.89819 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.881e+00  2.545e-01  19.181  < 2e-16 ***
## AGE              6.502e-03  2.733e-03   2.379  0.01796 *  
## SEXMALE         -3.390e-01  1.194e-01  -2.839  0.00483 ** 
## SMOKSTATFORMER   2.243e-01  1.188e-01   1.888  0.05992 .  
## SMOKSTATNEVER    3.108e-01  1.175e-01   2.646  0.00857 ** 
## QUETELET        -3.095e-02  6.150e-03  -5.033 8.33e-07 ***
## VITUSENOT OFTEN  2.114e-01  9.597e-02   2.203  0.02834 *  
## VITUSEOFTEN      2.428e-01  8.720e-02   2.785  0.00570 ** 
## FAT             -2.813e-03  1.207e-03  -2.331  0.02043 *  
## FIBER            2.485e-02  8.173e-03   3.041  0.00257 ** 
## BETADIET         5.344e-05  2.835e-05   1.885  0.06042 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6389 on 301 degrees of freedom
## Multiple R-squared:  0.2677, Adjusted R-squared:  0.2433 
## F-statistic:    11 on 10 and 301 DF,  p-value: 5.506e-16

The potential influential cases identified previously is the 35th, 39th, and 257th cases, we fit the model without 35th,39th and 257th cases and calculate the average absolute difference in the fitted values as 1.01%, 0.97% and 0.99% respectively. For 35th, 39th and 257th cases, the percentage change on the fitted value with or without the case is very small. Therefore, no case have an unduly large influence on prediction and thus all cases may be retained.

RETPLASMA

Preliminary Model Investigation

transformation to Retplasma

retplasma <- plasma[,-13]
fit.pre <- lm(RETPLASMA~., data = retplasma)
par(mfrow=c(2,2))
plot(fit.pre, which=c(1,2))
boxcox(fit.pre)

hist(log(plasma$RETPLASMA), xlab ="log(RETPLASMA)", main = "Histogram of log(RETPLASMA)")

fit.pre.trans <- lm(log(RETPLASMA)~., data = retplasma)
par(mfrow=c(2,2))
plot(fit.pre.trans, which=c(1,2))
summary(fit.pre.trans)
## 
## Call:
## lm(formula = log(RETPLASMA) ~ ., data = retplasma)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.09081 -0.19304  0.01105  0.21121  0.94285 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      6.138e+00  1.339e-01  45.849   <2e-16 ***
## AGE              4.707e-03  1.469e-03   3.204   0.0015 ** 
## SEXMALE          1.024e-01  6.229e-02   1.644   0.1013    
## SMOKSTATFORMER   1.162e-01  6.128e-02   1.896   0.0589 .  
## SMOKSTATNEVER    1.684e-02  6.079e-02   0.277   0.7820    
## QUETELET         2.193e-05  3.208e-03   0.007   0.9945    
## VITUSENOT OFTEN  3.862e-02  4.975e-02   0.776   0.4382    
## VITUSEOFTEN      2.100e-02  4.554e-02   0.461   0.6451    
## CALORIES         1.506e-04  1.002e-04   1.503   0.1338    
## FAT             -2.694e-03  1.581e-03  -1.705   0.0893 .  
## FIBER           -7.495e-03  5.548e-03  -1.351   0.1778    
## ALCOHOL         -3.140e-03  2.436e-03  -1.289   0.1983    
## CHOLESTEROL     -1.646e-04  2.158e-04  -0.763   0.4461    
## BETADIET        -9.060e-06  1.471e-05  -0.616   0.5383    
## RETDIET         -1.162e-05  3.656e-05  -0.318   0.7507    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3304 on 300 degrees of freedom
## Multiple R-squared:  0.0984, Adjusted R-squared:  0.05632 
## F-statistic: 2.339 on 14 and 300 DF,  p-value: 0.004437

fit full model (training / testing)

set.seed(1024)
index <- sample(1:315, ceiling(315*0.8) , replace = FALSE)
retplasma_t <- retplasma[index,] #training data
retplasma_v <- retplasma[-index,] #validation data 
retplasma_t_full <- lm(log(RETPLASMA)~., data=retplasma_t)
summary(retplasma_t_full)
## 
## Call:
## lm(formula = log(RETPLASMA) ~ ., data = retplasma_t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.05812 -0.19248  0.00453  0.21521  0.96596 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      6.220e+00  1.466e-01  42.445  < 2e-16 ***
## AGE              4.580e-03  1.648e-03   2.780  0.00588 ** 
## SEXMALE          3.301e-02  7.012e-02   0.471  0.63823    
## SMOKSTATFORMER   8.902e-02  6.885e-02   1.293  0.19728    
## SMOKSTATNEVER   -3.568e-02  6.751e-02  -0.529  0.59760    
## QUETELET        -1.244e-03  3.568e-03  -0.349  0.72774    
## VITUSENOT OFTEN  6.061e-03  5.662e-02   0.107  0.91483    
## VITUSEOFTEN      1.945e-02  5.141e-02   0.378  0.70551    
## CALORIES         1.205e-04  1.122e-04   1.074  0.28398    
## FAT             -2.891e-03  1.776e-03  -1.628  0.10489    
## FIBER           -5.772e-03  6.296e-03  -0.917  0.36016    
## ALCOHOL         -2.855e-03  2.655e-03  -1.075  0.28329    
## CHOLESTEROL      9.716e-06  2.423e-04   0.040  0.96805    
## BETADIET        -1.527e-05  1.714e-05  -0.891  0.37392    
## RETDIET          7.774e-06  3.949e-05   0.197  0.84411    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3366 on 237 degrees of freedom
## Multiple R-squared:  0.0928, Adjusted R-squared:  0.03921 
## F-statistic: 1.732 on 14 and 237 DF,  p-value: 0.05037
par(mfrow=c(2,2))
plot(retplasma_t_full)

## Model Selection ### Model selection approach 1: best subset #### First order model

subset <- regsubsets(log(RETPLASMA)~., data = retplasma_t, nvmax = 12, method ="exhaustive")
sum_sub <- summary(subset)
n.t <- nrow(retplasma_t)
p <- rowSums(sum_sub$which)
SSEp <- sum_sub$rss
R2 <- sum_sub$rsq
Ra2 <- sum_sub$adjr2
Cp <- sum_sub$cp
AIC <- n.t*log(SSEp/n.t)+2*p
BIC <-n.t*log(SSEp/n.t)+log(n.t)*p
criteria <- cbind(sum_sub$which, SSEp, R2, Ra2, Cp, AIC, BIC)

#add the criteria of null model
retplasma_t_null<- lm(log(RETPLASMA)~1, data = retplasma_t)
sse0 <- anova(retplasma_t_null)["Residuals",2]
r0 <- summary(retplasma_t_null)$r.squared
ra0 <- summary(retplasma_t_null)$adj.r.squared
p0 <- 1
cp0 <- (sse0/summary(retplasma_t_full)$sigma^2)-(n.t-2*p0)
aic0 <- n.t*log(sse0/n.t)+2*p0
bic0 <- n.t*log(sse0/n.t)+log(n.t)*p0
null <- c(1,rep(0,14),sse0,r0,ra0,cp0,aic0,bic0)
criteria <- rbind(null, criteria)
criteria <- as.data.frame(criteria)
criteria
##      (Intercept) AGE SEXMALE SMOKSTATFORMER SMOKSTATNEVER QUETELET
## null           1   0       0              0             0        0
## 1              1   1       0              0             0        0
## 2              1   1       0              1             0        0
## 3              1   1       0              1             0        0
## 4              1   1       0              1             0        0
## 5              1   1       0              1             1        0
## 6              1   1       0              1             0        0
## 7              1   1       0              1             0        0
## 8              1   1       0              1             1        0
## 9              1   1       1              1             1        0
## 10             1   1       1              1             1        0
## 11             1   1       1              1             1        1
## 12             1   1       1              1             1        1
##      VITUSENOT OFTEN VITUSEOFTEN CALORIES FAT FIBER ALCOHOL CHOLESTEROL
## null               0           0        0   0     0       0           0
## 1                  0           0        0   0     0       0           0
## 2                  0           0        0   0     0       0           0
## 3                  0           0        0   1     0       0           0
## 4                  0           0        0   1     0       0           0
## 5                  0           0        0   1     0       0           0
## 6                  0           0        1   1     1       1           0
## 7                  0           0        1   1     1       1           0
## 8                  0           0        1   1     1       1           0
## 9                  0           0        1   1     1       1           0
## 10                 0           1        1   1     1       1           0
## 11                 0           1        1   1     1       1           0
## 12                 0           1        1   1     1       1           0
##      BETADIET RETDIET     SSEp         R2        Ra2         Cp       AIC
## null        0       0 29.60318 0.00000000 0.00000000 11.2430086 -537.6699
## 1           0       0 28.29976 0.04402980 0.04020592  1.7405317 -547.0171
## 2           0       0 27.73321 0.06316781 0.05564305 -1.2591400 -550.1132
## 3           0       0 27.31759 0.07720749 0.06604467 -2.9269074 -551.9184
## 4           1       0 27.13028 0.08353489 0.06869335 -2.5798972 -551.6522
## 5           1       0 27.09431 0.08474999 0.06614735 -0.8973332 -549.9866
## 6           0       0 27.05596 0.08604557 0.06366302  0.7642043 -548.3435
## 7           1       0 26.95609 0.08941904 0.06329582  1.8829083 -547.2754
## 8           1       0 26.91770 0.09071602 0.06078075  3.5440815 -545.6346
## 9           1       0 26.89402 0.09151586 0.05772926  5.3351291 -543.8564
## 10          1       0 26.87563 0.09213688 0.05446621  7.1728939 -542.0287
## 11          1       0 26.86247 0.09258152 0.05099151  9.0567336 -540.1521
## 12          1       1 26.85754 0.09274795 0.04719554 11.0132556 -538.1983
##            BIC
## null -534.1405
## 1    -539.9583
## 2    -539.5249
## 3    -537.8006
## 4    -534.0051
## 5    -528.8100
## 6    -523.6375
## 7    -519.0400
## 8    -513.8697
## 9    -508.5621
## 10   -503.2049
## 11   -497.7990
## 12   -492.3158
which.best.sub <- data.frame(
  Ra2 = which.max(criteria$Ra2),
  Cp = which.min(criteria$Cp),
  AIC = which.min(criteria$AIC),
  BIC = which.min(criteria$BIC)
)
which.best.sub
##   Ra2 Cp AIC BIC
## 1   5  4   4   2
rbind(criteria[5,],rbind(criteria[4,],criteria[2,]))
##   (Intercept) AGE SEXMALE SMOKSTATFORMER SMOKSTATNEVER QUETELET VITUSENOT OFTEN
## 4           1   1       0              1             0        0               0
## 3           1   1       0              1             0        0               0
## 1           1   1       0              0             0        0               0
##   VITUSEOFTEN CALORIES FAT FIBER ALCOHOL CHOLESTEROL BETADIET RETDIET     SSEp
## 4           0        0   1     0       0           0        1       0 27.13028
## 3           0        0   1     0       0           0        0       0 27.31759
## 1           0        0   0     0       0           0        0       0 28.29976
##           R2        Ra2        Cp       AIC       BIC
## 4 0.08353489 0.06869335 -2.579897 -551.6522 -534.0051
## 3 0.07720749 0.06604467 -2.926907 -551.9184 -537.8006
## 1 0.04402980 0.04020592  1.740532 -547.0171 -539.9583

By adjusted r squared, the model containing AGE, SMOKSTAT, FAT, BETADIET is the best. By Mallow’s Cp and AIC criterion, the model containing AGE, SMOKSTAT, FAT is the best. By BIC criteria which prefers a smaller model than AIC criteria, the model containing AGE is the best.

fit the result above

#best subset from adjusted r square
best.sub1 <- lm(log(RETPLASMA)~AGE+SMOKSTAT+FAT+BETADIET, data = retplasma_t)
summary(best.sub1)
## 
## Call:
## lm(formula = log(RETPLASMA) ~ AGE + SMOKSTAT + FAT + BETADIET, 
##     data = retplasma_t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.05877 -0.18077  0.00587  0.21069  0.97001 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     6.223e+00  1.055e-01  58.994  < 2e-16 ***
## AGE             4.354e-03  1.450e-03   3.003  0.00295 ** 
## SMOKSTATFORMER  8.765e-02  6.647e-02   1.319  0.18853    
## SMOKSTATNEVER  -3.641e-02  6.372e-02  -0.571  0.56819    
## FAT            -1.107e-03  6.312e-04  -1.754  0.08063 .  
## BETADIET       -1.792e-05  1.454e-05  -1.232  0.21894    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3319 on 246 degrees of freedom
## Multiple R-squared:  0.08475,    Adjusted R-squared:  0.06615 
## F-statistic: 4.556 on 5 and 246 DF,  p-value: 0.000543
par(mfrow=c(2,2))
plot(best.sub1)

#best subset from Cp and AIC
best.sub2 <- lm(log(RETPLASMA)~AGE+SMOKSTAT+FAT, data = retplasma_t)
summary(best.sub2)
## 
## Call:
## lm(formula = log(RETPLASMA) ~ AGE + SMOKSTAT + FAT, data = retplasma_t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.10481 -0.18420  0.00993  0.21327  0.98907 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     6.2067638  0.1047904  59.230  < 2e-16 ***
## AGE             0.0042786  0.0014500   2.951  0.00347 ** 
## SMOKSTATFORMER  0.0746569  0.0656984   1.136  0.25691    
## SMOKSTATNEVER  -0.0451414  0.0633872  -0.712  0.47704    
## FAT            -0.0012418  0.0006224  -1.995  0.04711 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3322 on 247 degrees of freedom
## Multiple R-squared:  0.0791, Adjusted R-squared:  0.06418 
## F-statistic: 5.304 on 4 and 247 DF,  p-value: 0.0004097
par(mfrow=c(2,2))
plot(best.sub2)

#best subset from BIC
best.sub3 <- lm(log(RETPLASMA)~AGE+FAT, data = retplasma_t)
summary(best.sub3)
## 
## Call:
## lm(formula = log(RETPLASMA) ~ AGE + FAT, data = retplasma_t)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1580 -0.1800  0.0038  0.2122  1.0648 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.186871   0.096875  63.865  < 2e-16 ***
## AGE          0.004414   0.001442   3.062  0.00244 ** 
## FAT         -0.001013   0.000621  -1.631  0.10408    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3353 on 249 degrees of freedom
## Multiple R-squared:  0.05414,    Adjusted R-squared:  0.04654 
## F-statistic: 7.126 on 2 and 249 DF,  p-value: 0.0009783
par(mfrow=c(2,2))
plot(best.sub3)

Model selection approach 2: Stepwise model selection

First order model

step.sub1 <- stepAIC(retplasma_t_null, scope = list(upper=retplasma_t_full, lower=~1), direction = "both", k = 2, trace = FALSE)
summary(step.sub1) #It's the same result as the one we got from best subset selection from Cp and AIC 
## 
## Call:
## lm(formula = log(RETPLASMA) ~ AGE + SMOKSTAT + FAT, data = retplasma_t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.10481 -0.18420  0.00993  0.21327  0.98907 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     6.2067638  0.1047904  59.230  < 2e-16 ***
## AGE             0.0042786  0.0014500   2.951  0.00347 ** 
## SMOKSTATFORMER  0.0746569  0.0656984   1.136  0.25691    
## SMOKSTATNEVER  -0.0451414  0.0633872  -0.712  0.47704    
## FAT            -0.0012418  0.0006224  -1.995  0.04711 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3322 on 247 degrees of freedom
## Multiple R-squared:  0.0791, Adjusted R-squared:  0.06418 
## F-statistic: 5.304 on 4 and 247 DF,  p-value: 0.0004097

Second order model

retplasma_t_full2 <- lm(log(RETPLASMA)~.^2, data = retplasma_t)
step.sub2 <- stepAIC(retplasma_t_null, scope = list(upper=retplasma_t_full2, lower=~1), direction = "both", k = 2, trace = FALSE)
summary(step.sub2) 
## 
## Call:
## lm(formula = log(RETPLASMA) ~ AGE + SMOKSTAT + FAT, data = retplasma_t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.10481 -0.18420  0.00993  0.21327  0.98907 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     6.2067638  0.1047904  59.230  < 2e-16 ***
## AGE             0.0042786  0.0014500   2.951  0.00347 ** 
## SMOKSTATFORMER  0.0746569  0.0656984   1.136  0.25691    
## SMOKSTATNEVER  -0.0451414  0.0633872  -0.712  0.47704    
## FAT            -0.0012418  0.0006224  -1.995  0.04711 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3322 on 247 degrees of freedom
## Multiple R-squared:  0.0791, Adjusted R-squared:  0.06418 
## F-statistic: 5.304 on 4 and 247 DF,  p-value: 0.0004097
# same as the step.sub1

Model Validation

Internal validation of candidate_m1, candidate_m2, and candidate_m3

#test on 3 best models so far
candidate_m1 <- lm(log(RETPLASMA)~AGE+SMOKSTAT+FAT+BETADIET, data = retplasma_t)
candidate_m2 <- lm(log(RETPLASMA) ~ AGE + SMOKSTAT + FAT, data = retplasma_t)
candidate_m3 <- lm(log(RETPLASMA) ~ AGE + FAT, data = retplasma_t)

#find SSE for candidate_m1 and candidate_m2
sse.cm1 <- anova(candidate_m1)["Residuals",2]
sse.cm2 <- anova(candidate_m2)["Residuals",2]
sse.cm3 <- anova(candidate_m3)["Residuals",2]
sse.compare <- c(sse.cm1,sse.cm2,sse.cm3)

#find MSE for candidate_m1 and candidate_m2
mse.cm1 <- anova(candidate_m1)["Residuals",3]
mse.cm2 <- anova(candidate_m2)["Residuals",3]
mse.cm3 <- anova(candidate_m3)["Residuals",3]
mse.compare <- c(mse.cm1,mse.cm2, mse.cm3)

#find p for candidate_m1 and candidate_m2
p.cm1 <- 5
p.cm2 <- 4
p.cm3 <- 3
p.compare <- c(p.cm1,p.cm2, p.cm3)

#find Cp for candidate_m1 and candidate_m2
#sigma^2: MSE_fullmodel
#n.t: nrow(training data)
mse_full <- summary(retplasma_t_full)$sigma^2
cp.cm1 <- (sse.cm1/mse_full)-(n.t-2*p.cm1)
cp.cm2 <- (sse.cm2/mse_full)-(n.t-2*p.cm2)
cp.cm3 <- (sse.cm3/mse_full)-(n.t-2*p.cm3)
cp.compare <- c(cp.cm1,cp.cm2,cp.cm3)

#find Pressp for candidate_m1 and candidate_m2
press.cm1 <- sum(candidate_m1$residuals^2/(1-influence(candidate_m1)$hat)^2)
press.cm2 <- sum(candidate_m2$residuals^2/(1-influence(candidate_m2)$hat)^2)
press.cm3 <- sum(candidate_m3$residuals^2/(1-influence(candidate_m3)$hat)^2)
press.compare <- c(press.cm1,press.cm2,press.cm3)

compare <- data.frame(sse=sse.compare, mse=mse.compare, p=p.compare, cp=cp.compare, press=press.compare)
rownames(compare) <- c("candidate_m1","candidate_m2","candidate_m3")
compare
##                   sse       mse p        cp    press
## candidate_m1 27.09431 0.1101395 5 -2.897333 28.65852
## candidate_m2 27.26162 0.1103709 4 -3.420886 28.40186
## candidate_m3 28.00048 0.1124517 3  1.099474 28.67201

External validation

##candidate_m1 on validation data
candidate_m1.v <- lm(candidate_m1, data = retplasma_v)
candidate_m2.v <- lm(candidate_m2, data = retplasma_v)
candidate_m3.v <- lm(candidate_m3, data = retplasma_v)
#summary on training data and validation data respectively
list(summary(candidate_m1), summary(candidate_m2), summary(candidate_m3))
## [[1]]
## 
## Call:
## lm(formula = log(RETPLASMA) ~ AGE + SMOKSTAT + FAT + BETADIET, 
##     data = retplasma_t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.05877 -0.18077  0.00587  0.21069  0.97001 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     6.223e+00  1.055e-01  58.994  < 2e-16 ***
## AGE             4.354e-03  1.450e-03   3.003  0.00295 ** 
## SMOKSTATFORMER  8.765e-02  6.647e-02   1.319  0.18853    
## SMOKSTATNEVER  -3.641e-02  6.372e-02  -0.571  0.56819    
## FAT            -1.107e-03  6.312e-04  -1.754  0.08063 .  
## BETADIET       -1.792e-05  1.454e-05  -1.232  0.21894    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3319 on 246 degrees of freedom
## Multiple R-squared:  0.08475,    Adjusted R-squared:  0.06615 
## F-statistic: 4.556 on 5 and 246 DF,  p-value: 0.000543
## 
## 
## [[2]]
## 
## Call:
## lm(formula = log(RETPLASMA) ~ AGE + SMOKSTAT + FAT, data = retplasma_t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.10481 -0.18420  0.00993  0.21327  0.98907 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     6.2067638  0.1047904  59.230  < 2e-16 ***
## AGE             0.0042786  0.0014500   2.951  0.00347 ** 
## SMOKSTATFORMER  0.0746569  0.0656984   1.136  0.25691    
## SMOKSTATNEVER  -0.0451414  0.0633872  -0.712  0.47704    
## FAT            -0.0012418  0.0006224  -1.995  0.04711 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3322 on 247 degrees of freedom
## Multiple R-squared:  0.0791, Adjusted R-squared:  0.06418 
## F-statistic: 5.304 on 4 and 247 DF,  p-value: 0.0004097
## 
## 
## [[3]]
## 
## Call:
## lm(formula = log(RETPLASMA) ~ AGE + FAT, data = retplasma_t)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1580 -0.1800  0.0038  0.2122  1.0648 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.186871   0.096875  63.865  < 2e-16 ***
## AGE          0.004414   0.001442   3.062  0.00244 ** 
## FAT         -0.001013   0.000621  -1.631  0.10408    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3353 on 249 degrees of freedom
## Multiple R-squared:  0.05414,    Adjusted R-squared:  0.04654 
## F-statistic: 7.126 on 2 and 249 DF,  p-value: 0.0009783
list(summary(candidate_m1.v), summary(candidate_m2.v), summary(candidate_m3.v))
## [[1]]
## 
## Call:
## lm(formula = candidate_m1, data = retplasma_v)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.83435 -0.18186  0.01204  0.19784  0.88838 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     5.754e+00  2.321e-01  24.791   <2e-16 ***
## AGE             7.766e-03  3.191e-03   2.434   0.0181 *  
## SMOKSTATFORMER  3.096e-01  1.427e-01   2.169   0.0343 *  
## SMOKSTATNEVER   2.459e-01  1.404e-01   1.751   0.0853 .  
## FAT             3.784e-04  1.319e-03   0.287   0.7752    
## BETADIET       -1.736e-06  2.848e-05  -0.061   0.9516    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3099 on 57 degrees of freedom
## Multiple R-squared:  0.1619, Adjusted R-squared:  0.08836 
## F-statistic: 2.202 on 5 and 57 DF,  p-value: 0.06655
## 
## 
## [[2]]
## 
## Call:
## lm(formula = candidate_m2, data = retplasma_v)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8317 -0.1808  0.0135  0.1991  0.8911 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.7540061  0.2300812  25.009   <2e-16 ***
## AGE            0.0077207  0.0030743   2.511   0.0148 *  
## SMOKSTATFORMER 0.3090616  0.1411968   2.189   0.0326 *  
## SMOKSTATNEVER  0.2446627  0.1377798   1.776   0.0810 .  
## FAT            0.0003715  0.0013026   0.285   0.7765    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3072 on 58 degrees of freedom
## Multiple R-squared:  0.1618, Adjusted R-squared:  0.104 
## F-statistic:   2.8 on 4 and 58 DF,  p-value: 0.03404
## 
## 
## [[3]]
## 
## Call:
## lm(formula = candidate_m3, data = retplasma_v)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8036 -0.2095  0.0399  0.1953  0.8815 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.027e+00  1.948e-01  30.941   <2e-16 ***
## AGE         7.663e-03  3.128e-03   2.449   0.0172 *  
## FAT         4.376e-05  1.322e-03   0.033   0.9737    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3143 on 60 degrees of freedom
## Multiple R-squared:  0.09255,    Adjusted R-squared:  0.06231 
## F-statistic:  3.06 on 2 and 60 DF,  p-value: 0.05428
#percent change in parameter estimation
pct_chg.coef <- function(model, modelv, digit){
  coef.m <- coef(model)
  coef.v <- coef(modelv)
  pct.chg.coef <- round(abs(coef.v-coef.m)/abs(coef.m)*100, digit)
  pct.chg.coef
}

pct_chg.coef.cm1 <- pct_chg.coef(candidate_m1, candidate_m1.v, 2)
pct_chg.coef.cm2 <- pct_chg.coef(candidate_m2, candidate_m2.v, 2)
pct_chg.coef.cm3 <- pct_chg.coef(candidate_m3, candidate_m3.v, 2)

#percent change in standard error
pct_chg.sd <- function(model, modelv, digit){
  sd.m <- summary(model)$coefficients[, "Std. Error"]
  sd.v <- summary(modelv)$coefficients[, "Std. Error"]
  pct.chg.sd <- round(abs(sd.v-sd.m)/abs(sd.m)*100, digit)
  pct.chg.sd
}
pct_chg.sd.cm1 <- pct_chg.sd(candidate_m1, candidate_m1.v, 2)
pct_chg.sd.cm2 <- pct_chg.sd(candidate_m2, candidate_m2.v, 2)
pct_chg.sd.cm3 <- pct_chg.sd(candidate_m3, candidate_m3.v, 2)

pct_chg_beta_cm1 <- data.frame(pct_chg.coef.cm1,pct_chg.sd.cm1) 
pct_chg_beta_cm1 
##                pct_chg.coef.cm1 pct_chg.sd.cm1
## (Intercept)                7.53         120.05
## AGE                       78.38         120.08
## SMOKSTATFORMER           253.28         114.75
## SMOKSTATNEVER            775.24         120.34
## FAT                      134.18         108.93
## BETADIET                  90.32          95.87
pct_chg_beta_cm2 <- data.frame(pct_chg.coef.cm2,pct_chg.sd.cm2)
pct_chg_beta_cm2
##                pct_chg.coef.cm2 pct_chg.sd.cm2
## (Intercept)                7.29         119.56
## AGE                       80.45         112.03
## SMOKSTATFORMER           313.98         114.92
## SMOKSTATNEVER            641.99         117.36
## FAT                      129.92         109.30
pct_chg_beta_cm3 <- data.frame(pct_chg.coef.cm3,pct_chg.sd.cm3)
pct_chg_beta_cm3
##             pct_chg.coef.cm3 pct_chg.sd.cm3
## (Intercept)             2.58         101.08
## AGE                    73.59         116.96
## FAT                   104.32         112.85
colnames(pct_chg_beta_cm1) <- c("change in coefficient(%)","change in standard deviation(%)" )
colnames(pct_chg_beta_cm2) <- c("change in coefficient(%)","change in standard deviation(%)" )
colnames(pct_chg_beta_cm3) <- c("change in coefficient(%)","change in standard deviation(%)" )

#mean squared prediction error
mspe <- function(model, dv, data){
  yhat <- predict(model, newdata = data)
  y <- data[[dv]]
  mean((y-yhat)^2)
}

mspe.cm1 <- mspe(candidate_m1, "RETPLASMA", retplasma_v)
mspe.cm2 <- mspe(candidate_m2, "RETPLASMA", retplasma_v)
mspe.cm3 <- mspe(candidate_m3, "RETPLASMA", retplasma_v)

#compare with Pressp/n and SSEp/n 
candidate_m1.compare <- c(mspe.cm1,press.cm1/n.t,mse.cm1)
candidate_m2.compare <- c(mspe.cm2,press.cm2/n.t,mse.cm2)
candidate_m3.compare <- c(mspe.cm3,press.cm3/n.t,mse.cm3)

compare2 <-data.frame(candidate_m1.compare, candidate_m2.compare, candidate_m3.compare)
compare2 <- as.data.frame(t(compare2))
colnames(compare2) <- c("MSPE", "Press/n", "MSE")
rownames(compare2) <- c("candidate_m1","candidate_m2","candidate_m3")
compare2
##                  MSPE   Press/n       MSE
## candidate_m1 435048.0 0.1137243 0.1101395
## candidate_m2 435048.2 0.1127058 0.1103709
## candidate_m3 435050.0 0.1137778 0.1124517

Choose candidate_m2: Age, Smokstat, Fat

Model diagnostic: Outlying and influential cases

candidate_m2_final <- lm(candidate_m2, data = retplasma)
summary(candidate_m2_final)
## 
## Call:
## lm(formula = candidate_m2, data = retplasma)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1368 -0.1846  0.0036  0.2103  0.9818 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     6.1547933  0.0949166  64.844  < 2e-16 ***
## AGE             0.0045494  0.0013053   3.485 0.000562 ***
## SMOKSTATFORMER  0.1079461  0.0593416   1.819 0.069866 .  
## SMOKSTATNEVER   0.0031563  0.0575199   0.055 0.956275    
## FAT            -0.0010254  0.0005617  -1.826 0.068887 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3289 on 310 degrees of freedom
## Multiple R-squared:  0.077,  Adjusted R-squared:  0.06509 
## F-statistic: 6.465 on 4 and 310 DF,  p-value: 5.228e-05
anova(candidate_m2_final)
## Analysis of Variance Table
## 
## Response: log(RETPLASMA)
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## AGE         1  1.720 1.71983 15.8984 8.341e-05 ***
## SMOKSTAT    2  0.717 0.35854  3.3144   0.03765 *  
## FAT         1  0.360 0.36049  3.3325   0.06889 .  
## Residuals 310 33.535 0.10818                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(1,1))
plot(candidate_m2_final, which=c(1,2))

#check outliers in Y
#n <- nrow(totaldata)
n <- nrow(retplasma)
std.d.res <- studres(candidate_m2_final)
p.res <- 4
bon_thred <- qt(1-0.1/(2*n), n-p.res)
outlier.Y <- as.vector(which(abs(std.d.res) >= bon_thred))
outlier.Y
## integer(0)
hii <- influence(candidate_m2_final)$hat
#check outliers in X
outlier.X <- as.vector(which(hii>(2*p.res/n)))
outlier.X
##  [1]  32  33  42  44  50  62  68  75  82  88  89  95 100 101 109 111 122 124 134
## [20] 144 152 170 180 181 195 202 212 215 220 222 223 239 257 266 269 274 280 282
## [39] 289 290 296 302 305 308
plot(hii, residuals(candidate_m2_final), xlab = "leverage", ylab="residuals")

#influence index plot 
plot(candidate_m2_final, which = 4)

candidate_m2_final2 <- lm(candidate_m2_final, data = retplasma[-c(36,296),])
f1 <- fitted(candidate_m2_final)
f2 <- fitted(candidate_m2_final2)
SUM<-sum(abs((f1[-c(36,296)]-f2)/f1[-c(36,296)]))
SUM<-SUM+abs((f1[c(36,296)]-predict(candidate_m2_final,newdata = retplasma[c(36,296),]))/f1[c(36,296)])
per.average <- SUM/n
per.average
##           36          296 
## 0.0004030076 0.0004030076
summary(candidate_m2)
## 
## Call:
## lm(formula = log(RETPLASMA) ~ AGE + SMOKSTAT + FAT, data = retplasma_t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.10481 -0.18420  0.00993  0.21327  0.98907 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     6.2067638  0.1047904  59.230  < 2e-16 ***
## AGE             0.0042786  0.0014500   2.951  0.00347 ** 
## SMOKSTATFORMER  0.0746569  0.0656984   1.136  0.25691    
## SMOKSTATNEVER  -0.0451414  0.0633872  -0.712  0.47704    
## FAT            -0.0012418  0.0006224  -1.995  0.04711 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3322 on 247 degrees of freedom
## Multiple R-squared:  0.0791, Adjusted R-squared:  0.06418 
## F-statistic: 5.304 on 4 and 247 DF,  p-value: 0.0004097

The potential influential cases identified previously is the 36th and 296th cases, we fit the model without 36th and 296th cases and calculate the average absolute difference in the fitted values as 0.040% respectively. For 36th and 296th cases, the percentage change on the fitted value with or without the case is very small. Therefore, no case have an unduly large influence on prediction and thus all cases may be retained.